from tsnmf_edit import TSNMF
from sklearn.feature_extraction.text import TfidfTransformer
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import joblib
import matplotlib as mpl
from scipy.sparse import load_npz
import ujson as json
import seaborn as sns
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.preprocessing import normalize, MinMaxScaler
from sklearn.feature_extraction.text import TfidfTransformer
# Lower/upper bounds (in points²) for the MinMaxScaler that sizes map bubbles.
sup_min_bubble_size = 0
sup_max_bubble_size = 70
# Global seaborn theme for every figure in this notebook.
sns.set(style='ticks', palette='magma', context='notebook', font='Linux Biolinum O')
#%config InlineBackend.figure_format = 'retina'
# NOTE(review): IPython magic — this file is a notebook export, not importable Python.
%matplotlib inline
def normalize_dataframe_rows(df, norm='l1'):
    """Return a copy of *df* with each row rescaled to unit norm.

    Parameters
    ----------
    df : pandas.DataFrame of numeric values.
    norm : str, default 'l1'
        Norm passed to sklearn's ``normalize`` ('l1', 'l2' or 'max').
        Added as a backward-compatible parameter; the original hard-coded 'l1'.

    Returns
    -------
    pandas.DataFrame with the same index and columns as *df*.
    """
    # sklearn returns a bare ndarray, so rebuild the frame around it.
    scaled = normalize(df, norm=norm)
    return pd.DataFrame(scaled, index=df.index, columns=df.columns)
def plot_component(column, **kwargs):
    """Scatter one factor component on the current matplotlib axes.

    Intended as a ``seaborn.FacetGrid.map_dataframe`` callback: seaborn passes
    the facet's rows via the ``data`` kwarg (expected to be a GeoDataFrame with
    point geometry) and *column* names the value column to plot.

    Keyword args
    ------------
    data : GeoDataFrame (injected by FacetGrid).
    scaler : fitted MinMaxScaler or None
        Maps sqrt-compressed values to bubble sizes. Previously a missing
        scaler crashed with AttributeError; now the raw sqrt values are used.
    """
    df = kwargs.pop('data')
    ax = plt.gca()
    # Optional context layers, kept for reference:
    #urban_context.plot(ax=ax, facecolor='#efefef', edgecolor='#cfcfcf')
    #highways.plot(ax=ax, color='#333333', linewidth=1)
    #primary.plot(ax=ax, color='orange', linewidth=1)
    #cycleways.plot(ax=ax, color='lime', linewidth=1)
    scaler = kwargs.pop('scaler', None)
    # sqrt so bubble *area* tracks the value less aggressively than linearly.
    sizes = np.sqrt(df[column].values.reshape(-1, 1))
    if scaler is not None:
        sizes = scaler.transform(sizes)
    ax.scatter(df.geometry.x, df.geometry.y, s=sizes, color='#4B0082', alpha=0.5, zorder=20)
    plt.axis('off')
    plt.axis('equal')
# Región Metropolitana urban limits (Census 2017).
# NOTE(review): this whole cell is superseded by the reload immediately below,
# which uses a slightly different exclusion list; it is kept for fidelity.
RM_map = gpd.read_file('maps/R13/LIMITE_URBANO_CENSAL_C17.shp')
# np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin int is
# the documented replacement and behaves identically here.
RM_map['COMUNA'] = RM_map['COMUNA'].astype(int)
RM_map = RM_map.set_index('COMUNA')
# Keep only 'CIUDAD' polygons and drop peripheral comunas / urban areas.
RM_map2 = RM_map[RM_map.NOM_CATEG=='CIUDAD']
RM_map = RM_map2[~RM_map2.NOM_COMUNA.isin(['CURACAVÍ', 'TILTIL', 'SAN JOSÉ DE MAIPO', 'PIRQUE', 'EL MONTE','TALAGANTE','MELIPILLA', 'LAMPA', 'COLINA', 'CALERA DE TANGO', 'PEÑAFLOR', 'BUIN', 'ISLA DE MAIPO', 'PAINE'])]
RM_map = RM_map[~RM_map.URBANO.isin(['CIUDAD DEL VALLE'])]
# Reload the urban-limits layer; this version's exclusion list is the one the
# rest of the notebook actually uses.
RM_map = gpd.read_file('maps/R13/LIMITE_URBANO_CENSAL_C17.shp')
# np.int was deprecated in NumPy 1.20 and removed in 1.24; builtin int is equivalent.
RM_map['COMUNA'] = RM_map['COMUNA'].astype(int)
RM_map = RM_map.set_index('COMUNA')
# Keep only 'CIUDAD' polygons and drop peripheral comunas / urban areas.
RM_map2 = RM_map[RM_map.NOM_CATEG=='CIUDAD']
RM_map = RM_map2[~RM_map2.NOM_COMUNA.isin(['CURACAVÍ', 'TILTIL', 'PADRE HURTADO', 'SAN JOSÉ DE MAIPO', 'PIRQUE', 'EL MONTE','TALAGANTE','MELIPILLA', 'COLINA', 'CALERA DE TANGO', 'PEÑAFLOR', 'BUIN', 'ISLA DE MAIPO', 'PAINE'])]
RM_map = RM_map[~RM_map.URBANO.isin(['CIUDAD DEL VALLE', 'BATUCO', 'CHICAUMA', 'LAMPA'])]
# Quick visual sanity check of the retained boundary.
RM_map.boundary.plot(color='black', alpha=0.2)
Al hacer predicciones del modelo mnlogit, el input necesita tener las features correspondientes a la fórmula utilizada en EOD_Model_v2.ipynb.
Por ahora zone_income es el único parámetro modificable.
# Zone id, area, parent comuna and geometry, reprojected to EPSG:5361
# (a projected CRS for Chile, so geometric operations use meters).
zones = gpd.read_file('json/urban_zones_2016.json').set_index('ID').to_crs({'init': 'epsg:5361'})
# Pairwise distances between an origin zone and a destination zone.
zone_distances = pd.read_csv('2017_results/zone_distances.csv.gz', index_col=['ZonaOrigen', 'ZonaDestino'])
# OD trip records with the period in which each trip was made; averaged into
# a (period, origin, destination) -> mean trip_count series.
zone_matrix = pd.read_json('2017_results/od_matrix_per_zone.json.gz', lines=True)
period_matrices = zone_matrix.groupby(['period', 'origin_zone', 'destination_zone'])['trip_count'].mean()
# Mean household income per zone.
zone_income = pd.read_csv('2017_results/zone_income.csv.gz', index_col='Zona')
# Square root of the minimum home-to-zone distance per zone
# (sqrt applied up front, presumably to tame the distance scale — TODO confirm).
zone_mindist = np.sqrt(pd.read_csv('2020_results/mindist_zonahogar.csv', index_col='Zona'))
Al usar TSNMF el modelo utiliza los waypoints. En el resultado final la matriz W almacena las torres.
# Cell towers that already carry transport-mode labels.
towers = gpd.read_file('2017_results/towers_with_labels.geo.json').set_index('tower')
# Sparse waypoint matrix between zones, plus its row-index metadata.
waypoints = load_npz('2017_results/waypoints_between_zones.npz')
waypoint_index = pd.read_json('2017_results/waypoints_between_zones_index.json.gz', lines=True)
periods = waypoint_index.period.unique()
# Bidirectional mappings between matrix row position and tower id.
idx_to_tower = dict(enumerate(towers.index.values))
tower_idx = {tower: position for position, tower in enumerate(towers.index.values)}
def find_tower_labels(keys, label_frame=None):
    """Return the set of tower ids with at least one positive flag in *keys*.

    Parameters
    ----------
    keys : list of column names holding 0/1 (or count) label flags.
    label_frame : DataFrame indexed by tower id, default None
        Frame to read the flag columns from. Defaults to the module-level
        ``towers`` GeoDataFrame (the original behavior); the parameter was
        added so the function can be used and tested on any labeled frame.

    Returns
    -------
    set of index values whose row-sum over *keys* is strictly positive.
    """
    frame = towers if label_frame is None else label_frame
    flag_sum = frame[keys].sum(axis=1)
    return set(flag_sum[flag_sum > 0].index.values)
# Tower-id sets per transport infrastructure group.
rail_keys = find_tower_labels(['within_metro', 'near_surface_metro', 'near_train'])
bus_keys = find_tower_labels(['near_bus_corridor', 'near_bus_routes'])
motorized_keys = find_tower_labels(['near_highways', 'near_primary_streets', 'near_secondary_streets'])
non_motorized_keys = find_tower_labels(['near_cycleways'])
taxi_keys = find_tower_labels(['near_share_taxi'])
# Mode name -> tower-id set. The sets deliberately overlap (the set-difference
# variants were tried and abandoned — see history).
labeled_keys = {
    'bus': bus_keys,
    'rail': rail_keys,
    'private': motorized_keys,
    'shared_taxi': taxi_keys,
    'non_motorized': non_motorized_keys,
}
# Stable integer topic id per mode, in insertion order of labeled_keys.
topic_ids = {name: number for number, name in enumerate(labeled_keys)}
# For every tower (iterating tower_idx in insertion order) collect the topic
# ids of every mode group it belongs to — the supervision input for TSNMF.
labels = [
    [topic_id for topic_name, topic_id in topic_ids.items() if tower in labeled_keys[topic_name]]
    for tower in tower_idx
]
# One fixed xkcd color per transport mode.
# (The original first built a 'Set2' palette and immediately overwrote it —
# that dead statement was removed.)
colors = ['green', 'pale red', 'windows blue', 'dusty purple', 'amber']
colors = sns.xkcd_palette(colors)
color_map = dict(zip(labeled_keys.keys(), colors))
# Overview figure: one large map (ax1) with all labeled towers plus a column
# of small per-mode subplots on the right edge of a 5x5 grid.
# NOTE(review): indentation was lost in the notebook export below — the loop
# bodies are flat; the original nesting (e.g. whether the ax1 title/legend
# lines sit inside the second loop) cannot be recovered from this text alone.
plt.figure(figsize=(12,12))
ax1 = plt.subplot2grid((5,5), (0,0), colspan=4, rowspan=5)
ax1.axis('off')
ax_dict = {'all' : ax1}
# Create one small axis per transport-mode label.
for i, k in enumerate(labeled_keys.keys()):
ax_dict[k] = plt.subplot2grid((5,5), (i,4))
ax_dict[k].axis('off')
# Plot every label group on the shared map and on its own small subplot,
# using the per-mode colors from color_map.
for label, tower_ids in labeled_keys.items():
towers.loc[tower_ids].plot(ax=ax1, alpha=1, marker='.', markersize=100, zorder=30,
linewidth=1, label=label, color=color_map[label])
ax1.set_title("Towers in RM")
ax1.legend(loc=3)
RM_map.boundary.plot(color='gray', alpha=0.2, ax=ax1)
towers.loc[tower_ids].plot(ax=ax_dict[label], alpha=1, marker='.', markersize=10, zorder=30,
linewidth=1, label=label, color=color_map[label])
ax_dict[label].set_title(label)
RM_map.boundary.plot(color='black', alpha=0.2, ax=ax_dict[label], linewidth=0.5)
Dicho modelo proviene de EOD_Model.ipynb
# Column order expected by the pre-trained multinomial-logit model
# (the model itself comes from EOD_Model.ipynb).
with open('2020_results/mnlogit_column_names.json') as f:
mnlogit_column_names = json.load(f)
#mnlogit_column_names[-1] = 'shared_taxi'
mnlogit = joblib.load('2020_results/mnlogit_zone_model_instance.joblib.gz')
# Notebook-style bare expression: displays the names when run interactively.
mnlogit_column_names
# NOTE(review): BUG — 'waypoint_periods' is only defined inside the per-period
# loop further down, so running top-to-bottom this raises NameError; 'a' is
# also never used afterwards. Looks like a leftover scratch cell — confirm
# before deleting.
a = TfidfTransformer(norm='l1').fit_transform(waypoints[waypoint_periods.index]).T
a
Ejecutamos la iteración por periodo
# Full pipeline, executed once per time period:
#   1. binarize the period's waypoint matrix and factorize it with
#      topic-supervised NMF (towers x modes in W, flows x modes in H),
#   2. turn the flow loadings into a modal-partition "evidence" distribution
#      (adding pedestrian and intermodal components),
#   3. compute a prior partition from the EOD multinomial-logit model,
#   4. fuse evidence and prior with Bayes' theorem, plot, and export.
# NOTE(review): the loop body was re-indented — the notebook export had lost
# all indentation. No statement was added, removed, or reordered.
i = 0
for period in periods:
    # if period != 'afternoon_peak': break
    print("Actual period:", period)
    # work only on this period
    waypoint_periods = waypoint_index[waypoint_index.period == period]
    # we use a binary matrix this time! we are interested in routes, not in magnitudes
    # Should we use TfidfTransformer(norm='l1')?
    to_factorize = csr_matrix(waypoints[waypoint_periods.index].T, copy=True)
    to_factorize[to_factorize > 0] = 1
    # these are the static values used to evaluate the prior from EOD
    zone_features = (waypoint_periods
        .join(zone_income, on='origin_zone', how='inner')
        .rename(columns={'mean_home_income': 'origin_income'})
        .join(zone_income, on='destination_zone', how='inner')
        .rename(columns={'mean_home_income': 'destination_income'})
        .join(zone_distances, on=['origin_zone', 'destination_zone'], how='inner')
        .join(zone_mindist.add_prefix('origen_'), on='origin_zone', how='inner')
        .join(zone_mindist.add_prefix('destino_'), on='destination_zone', how='inner')
        .pipe(lambda x: x[x['origin_zone'] != x['destination_zone']])
    )
    # perform the factorization; fixed random_state keeps runs reproducible
    nmf = TSNMF(n_components=len(topic_ids), init='random', verbose=1, tol=1e-06,
                random_state=6, max_iter=500)
    W = nmf.fit_transform(to_factorize, labels)
    # W rows are towers; L1-normalize so each tower is a distribution over modes
    tower_mode = pd.DataFrame(normalize(W, norm='l1'), index=towers.index.tolist(),
                              columns=labeled_keys.keys())
    tower_mode.index.name = 'tower'
    # plot the tower factorization for inspection
    sup_tower_components = (tower_mode
        .reset_index()
        .pipe(lambda x: pd.melt(x, id_vars='tower')))
    sup_scaler = MinMaxScaler(feature_range=(sup_min_bubble_size, sup_max_bubble_size))
    sup_scaler.fit(np.sqrt(sup_tower_components.value.values.reshape(-1, 1)))
    sup_geodf = gpd.GeoDataFrame(sup_tower_components.join(towers, on='tower'),
                                 crs={'init': 'epsg:4326'})
    g = sns.FacetGrid(data=sup_geodf, col='variable', height=7, aspect=1,
                      sharex=False, sharey=False)
    for ax in g.axes.ravel():
        RM_map.boundary.plot(color='black', alpha=0.2, ax=ax)
    g.map_dataframe(plot_component, 'value', scaler=sup_scaler)
    # retrieve the zone-mode matrix (H transposed: one row per OD flow)
    flow_mode = (pd.DataFrame(nmf.components_.T,
                              index=waypoint_periods.index,
                              columns=labeled_keys.keys())
        .join(waypoint_periods)
        .set_index(['origin_zone', 'destination_zone'])
        .drop(['index', 'period'], axis=1)
    )
    # estimate a probability for non motorized trips.
    # we use a logistic function. the 0.5 prob for non motorized trips is assigned at 250 meters.
    pedestrian_p = 1 - 1 / (1 + np.exp(-(zone_distances.loc[flow_mode.index].distance - 0.25)))
    pedestrian_p.name = 'pedestrian'
    # CHECK THIS: heuristic intermodal combinations of the base mode loadings
    intermodal_private_p = (flow_mode['bus'] + flow_mode['rail'] + flow_mode['bus']*flow_mode['rail'])*flow_mode['shared_taxi']*0.5
    intermodal_public_p = (flow_mode['bus']*flow_mode['rail'])*0.5
    # evidence: row-normalized loadings scaled by (1 - pedestrian), then the
    # pedestrian/intermodal columns are appended and rows renormalized
    flow_evidence = (flow_mode
        .pipe(normalize_dataframe_rows)
        .mul(1 - pedestrian_p.values, axis='rows')
        .assign(pedestrian=pedestrian_p.values)
        .assign(intermodal_private=intermodal_private_p.values)
        .assign(intermodal_public=intermodal_public_p.values)
        .pipe(normalize_dataframe_rows)
    )
    # keep only flows that have features, in the mnlogit model's column order
    flow_evidence = flow_evidence.loc[zone_features.set_index(['origin_zone', 'destination_zone']).index]
    flow_evidence = flow_evidence.reindex(mnlogit_column_names, axis=1)
    # plot the evidence modal partition for inspection
    flow_evidence_trips = flow_evidence.mul(period_matrices.loc[period], axis='index')
    (flow_evidence_trips
        .join(zones.Comuna, on='origin_zone')
        .groupby('Comuna')
        .sum()
        .pipe(normalize_dataframe_rows)
        .sort_values('bus')
        .plot(kind='bar', stacked=True, figsize=(14, 7), cmap='viridis', width=0.9, legend=False)
    )
    plt.title('Evidence')
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), frameon=False)
    sns.despine()
    # estimate the prior information using the EOD mnlogit model
    zone_priors = mnlogit.predict(zone_features.set_index(['origin_zone', 'destination_zone']))
    zone_priors.columns = mnlogit_column_names
    flow_priors = zone_priors.loc[flow_evidence.index]
    # plot the prior partition for inspection
    flow_prior_trips = flow_priors.mul(period_matrices.loc[period], axis='index')
    (flow_prior_trips
        .join(zones.Comuna, on='origin_zone')
        .groupby('Comuna')
        .sum()
        .pipe(normalize_dataframe_rows)
        .sort_values('bus')
        .plot(kind='bar', stacked=True, figsize=(14, 7), cmap='viridis', width=0.9, legend=False)
    )
    plt.title('Prior')
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), frameon=False)
    sns.despine()
    # use bayes theorem to estimate the posterior modal partition
    denominator = flow_priors.mul(flow_evidence).sum(axis=1)
    flow_posteriors = flow_evidence.mul(flow_priors).div(denominator, axis='index')
    # plot the posterior partition for inspection
    flow_trips = flow_posteriors.mul(period_matrices.loc[period], axis='index')
    (flow_trips
        .join(zones.Comuna, on='origin_zone')
        .groupby('Comuna')
        .sum()
        .pipe(normalize_dataframe_rows)
        .sort_values('bus')
        .plot(kind='bar', stacked=True, figsize=(14, 7), cmap='viridis', width=0.9, legend=False)
    )
    plt.title('Posterior')
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), frameon=False)
    sns.despine()
    # export data: raw (unnormalized) tower loadings, flow loadings, posterior
    raw_tower_mode = pd.DataFrame(W, index=towers.index.tolist(), columns=labeled_keys.keys())
    raw_tower_mode.index.name = 'tower'
    raw_tower_mode.to_csv('2020_results/flow_model_{}_tower_mode.csv.gz'.format(period))
    flow_mode.to_csv('2020_results/flow_model_{}_zone_mode.csv.gz'.format(period))
    (flow_posteriors
        .reset_index()
        .assign(period=period)
        .to_json('2020_results/modal_partition_zones_{}.json.gz'.format(period),
                 compression='gzip', orient='records', lines=True)
    )
    # done!
    plt.show()
    i += 1
print("Done...")